* Fixed regression: It's now possible to specify a custom hostname again
[lhc/web/wiklou.git] / maintenance / generateSitemap.php
1 <?php
2 /**
3 * Creates a Google sitemap for the site
4 *
5 * @package MediaWiki
6 * @subpackage Maintenance
7 *
8 * @copyright Copyright © 2005, Ævar Arnfjörð Bjarmason
9 * @copyright Copyright © 2005, Jens Frank <jeluf@gmx.de>
10 * @copyright Copyright © 2005, Brion Vibber <brion@pobox.com>
11 *
12 * @link http://www.google.com/webmasters/sitemaps/docs/en/about.html
13 * @link http://www.google.com/schemas/sitemap/0.84/sitemap.xsd
14 *
15 * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
16 */
17
// An optional first command line argument is used as the server name; it is
// assigned before commandLine.inc is included.
// NOTE(review): presumably so the wiki configuration picks up the custom
// hostname when building URLs -- confirm against commandLine.inc.
if ( isset( $argv[1] ) ) {
	$_SERVER['SERVER_NAME'] = $argv[1];
}

// Declare --path as an option that takes a value, then load the command line
// environment.
$optionsWithArgs = array( 'path' );
require_once( 'commandLine.inc' );

// Pseudo-namespace keys used by GenerateSitemap::$priorities for namespaces
// that are not explicitly listed there.
define( 'GS_MAIN', -2 );
define( 'GS_TALK', -1 );

// Pass null when no --path option was supplied; the constructor turns that
// into an empty prefix.
$gs = new GenerateSitemap( isset( $options['path'] ) ? $options['path'] : null );
$gs->main();
30
class GenerateSitemap {
	/**
	 * The number of entries to save in each sitemap file
	 *
	 * @var int
	 */
	var $limit;

	/**
	 * The path prepended to the filenames of the files written to disk
	 *
	 * @var string
	 */
	var $path;

	/**
	 * Key => value entries of namespaces and their priorities
	 *
	 * @var array
	 */
	var $priorities = array(
		// Custom main namespaces
		GS_MAIN			=> '0.5',
		// Custom talk namespaces
		GS_TALK			=> '0.1',
		// MediaWiki standard namespaces
		NS_MAIN			=> '1.0',
		NS_TALK			=> '0.1',
		NS_USER			=> '0.5',
		NS_USER_TALK		=> '0.1',
		NS_PROJECT		=> '0.5',
		NS_PROJECT_TALK		=> '0.1',
		NS_IMAGE		=> '0.5',
		NS_IMAGE_TALK		=> '0.1',
		NS_MEDIAWIKI		=> '0.0',
		NS_MEDIAWIKI_TALK	=> '0.1',
		NS_TEMPLATE		=> '0.0',
		NS_TEMPLATE_TALK	=> '0.1',
		NS_HELP			=> '0.5',
		NS_HELP_TALK		=> '0.1',
		NS_CATEGORY		=> '0.5',
		NS_CATEGORY_TALK	=> '0.1',
	);

	/**
	 * A one-dimensional array of namespaces in the wiki
	 *
	 * @var array
	 */
	var $namespaces = array();

	/**
	 * A database slave object
	 *
	 * @var object
	 */
	var $dbr;

	/**
	 * A resource pointing to the sitemap index file
	 *
	 * @var resource
	 */
	var $findex;

	/**
	 * A resource pointing to a sitemap file
	 *
	 * @var resource
	 */
	var $file;

	/**
	 * A resource pointing to php://stderr
	 *
	 * @var resource
	 */
	var $stderr;

	/**
	 * Constructor
	 *
	 * Opens stderr and the sitemap index file, fetches the list of
	 * namespaces in use and computes an initial per-file entry limit.
	 *
	 * @param string $path The path to prepend to the filenames, used to
	 *                     save them somewhere else than in the root directory
	 */
	function GenerateSitemap( $path ) {
		global $wgDBname;

		$this->path = isset( $path ) ? $path : '';
		$this->stderr = fopen( 'php://stderr', 'wt' );

		$this->dbr =& wfGetDB( DB_SLAVE );
		$this->generateNamespaces();
		$this->generateLimit( NS_MAIN );
		$this->findex = fopen( "{$this->path}sitemap-index-$wgDBname.xml", 'wb' );
	}

	/**
	 * Generate a one-dimensional array of existing namespaces
	 *
	 * Fills $this->namespaces with every distinct page_namespace found in
	 * the page table, in ascending order.
	 */
	function generateNamespaces() {
		$fname = 'GenerateSitemap::generateNamespaces';

		$res = $this->dbr->select( 'page',
			array( 'page_namespace' ),
			array(),
			$fname,
			array(
				'GROUP BY' => 'page_namespace',
				'ORDER BY' => 'page_namespace',
			)
		);

		while ( $row = $this->dbr->fetchObject( $res ) ) {
			$this->namespaces[] = $row->page_namespace;
		}
	}

	/**
	 * Get the priority of a given namespace
	 *
	 * Uses the explicit entry in $this->priorities when there is one and
	 * falls back to guessPriority() otherwise.
	 *
	 * @param int $namespace The namespace to get the priority for
	 *
	 * @return string
	 */
	function priority( $namespace ) {
		return isset( $this->priorities[$namespace] ) ? $this->priorities[$namespace] : $this->guessPriority( $namespace );
	}

	/**
	 * If the namespace isn't listed on the priority list return the
	 * default priority for the namespace, varies depending on whether it's
	 * a talkpage or not.
	 *
	 * @param int $namespace The namespace to get the priority for
	 *
	 * @return string
	 */
	function guessPriority( $namespace ) {
		return Namespace::isMain( $namespace ) ? $this->priorities[GS_MAIN] : $this->priorities[GS_TALK];
	}

	/**
	 * Return a database resolution of all the pages in a given namespace
	 *
	 * @param int $namespace Limit the query to this namespace
	 *
	 * @return resource
	 */
	function getPageRes( $namespace ) {
		$fname = 'GenerateSitemap::getPageRes';

		return $this->dbr->select( 'page',
			array(
				'page_namespace',
				'page_title',
				'page_is_redirect',
				'page_touched',
			),
			array( 'page_namespace' => $namespace ),
			$fname
		);
	}

	/**
	 * Main loop
	 *
	 * Writes one or more gzipped sitemap files per namespace, each holding
	 * at most $this->limit entries, and records every file written in the
	 * sitemap index.
	 *
	 * @access public
	 */
	function main() {
		global $wgDBname;

		fwrite( $this->findex, $this->openIndex() );

		foreach ( $this->namespaces as $namespace ) {
			$res = $this->getPageRes( $namespace );
			$this->file = false;
			$i = $smcount = 0;

			$this->debug( $namespace );
			while ( $row = $this->dbr->fetchObject( $res ) ) {
				// Start a new sitemap file on the first row and every
				// time the current one has reached the entry limit
				if ( $i % $this->limit === 0 ) {
					if ( $this->file !== false ) {
						gzwrite( $this->file, $this->closeFile() );
						gzclose( $this->file );
					}
					// The limit depends on the length of the URLs in
					// this namespace, so recalculate it
					$this->generateLimit( $namespace );
					$filename = "sitemap-$wgDBname-NS_$namespace-$smcount.xml.gz";
					++$smcount;
					$this->file = gzopen( $this->path . $filename, 'wb' );
					gzwrite( $this->file, $this->openFile() );
					fwrite( $this->findex, $this->indexEntry( $filename ) );
					$this->debug( "\t$filename" );
				}
				++$i;
				$title = Title::makeTitle( $row->page_namespace, $row->page_title );
				$date = wfTimestamp( TS_ISO_8601, $row->page_touched );
				gzwrite( $this->file, $this->fileEntry( $title->getFullURL(), $date, $this->priority( $namespace ) ) );
			}
			// Finish the last (possibly partially filled) file of the
			// namespace, if any file was opened at all
			if ( $this->file ) {
				gzwrite( $this->file, $this->closeFile() );
				gzclose( $this->file );
			}
		}
		fwrite( $this->findex, $this->closeIndex() );
		fclose( $this->findex );
	}

	/**
	 * Return the XML required to open an XML file
	 *
	 * @static
	 *
	 * @return string
	 */
	function xmlHead() {
		return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
	}

	/**
	 * Return the XML schema being used
	 *
	 * @static
	 *
	 * @return string
	 */
	function xmlSchema() {
		return 'http://www.google.com/schemas/sitemap/0.84';
	}

	/**
	 * Return the XML required to open a sitemap index file
	 *
	 * @return string
	 */
	function openIndex() {
		return $this->xmlHead() . '<sitemapindex xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the XML for a single sitemap indexfile entry
	 *
	 * The location is built from $wgServer and $wgScriptPath; the path
	 * prefix used for writing the file to disk is not part of it.
	 *
	 * @static
	 *
	 * @param string $filename The filename of the sitemap file
	 *
	 * @return string
	 */
	function indexEntry( $filename ) {
		global $wgServer, $wgScriptPath;

		// The closing tag used to read </log>, which made every index
		// file malformed XML
		return
			"\t<sitemap>\n" .
			"\t\t<loc>$wgServer$wgScriptPath/$filename</loc>\n" .
			"\t</sitemap>\n";
	}

	/**
	 * Return the XML required to close a sitemap index file
	 *
	 * @static
	 *
	 * @return string
	 */
	function closeIndex() {
		return "</sitemapindex>\n";
	}

	/**
	 * Return the XML required to open a sitemap file
	 *
	 * @return string
	 */
	function openFile() {
		return $this->xmlHead() . '<urlset xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the XML for a single sitemap entry
	 *
	 * @static
	 *
	 * @param string $url An RFC 2396 compliant URL, not yet XML-escaped
	 * @param string $date An ISO 8601 date
	 * @param string $priority A priority indicator, 0.0 - 1.0 inclusive with a 0.1 stepsize
	 *
	 * @return string
	 */
	function fileEntry( $url, $date, $priority ) {
		// The URL may contain characters such as ampersands, which have
		// to be entity-escaped to keep the sitemap well-formed
		return
			"\t<url>\n" .
			"\t\t<loc>" . htmlspecialchars( $url ) . "</loc>\n" .
			"\t\t<lastmod>$date</lastmod>\n" .
			"\t\t<priority>$priority</priority>\n" .
			"\t</url>\n";
	}

	/**
	 * Return the XML required to close sitemap file
	 *
	 * @static
	 * @return string
	 */
	function closeFile() {
		return "</urlset>\n";
	}

	/**
	 * Write a string to stderr followed by a UNIX newline
	 *
	 * @param string $str The message to write
	 */
	function debug( $str ) {
		fwrite( $this->stderr, "$str\n" );
	}

	/**
	 * According to the sitemap specification each sitemap must contain no
	 * more than 50,000 urls and no more than 10MB (10 * 2^20 bytes), this
	 * function calculates how many urls we can have in each file assuming
	 * that we have the worst case of 63 four byte characters and 1 three
	 * byte character in the title (63*4+1*3 = 255)
	 *
	 * The result is stored in $this->limit and is always at least 1.
	 *
	 * @param int $namespace The namespace the worst case title is built in
	 */
	function generateLimit( $namespace ) {
		// Limits imposed by the sitemap specification
		$maxBytes = 10 * pow( 2, 20 );
		$maxUrls = 50000;

		$title = Title::makeTitle( $namespace, str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83" );

		$olen = strlen( $this->openFile() );
		$elen = strlen( $this->fileEntry( $title->getFullUrl(), wfTimestamp( TS_ISO_8601, wfTimestamp() ), '1.0' ) );
		$clen = strlen( $this->closeFile() );

		// Largest number of worst case entries that still fits between
		// the opening and closing XML without exceeding the byte limit
		$fit = (int)floor( ( $maxBytes - $olen - $clen ) / $elen );

		// Never go below one entry per file (the modulo in main() needs a
		// positive limit) and never above the url limit
		$this->limit = max( 1, min( $maxUrls, $fit ) );
	}
}
360
361 ?>